{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Homework 1: COVID-19 Cases Prediction (Regression)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Objectives:\n",
    "\n",
    "- Solve a regression problem with deep neural networks (DNN).\n",
    "- Understand basic DNN training tips.\n",
    "- Familiarize yourself with PyTorch.\n",
    "\n",
    "If you have any questions, please contact the TAs via TA hours, NTU COOL, or email to mlta-2022-spring@googlegroups.com"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import Package"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Numerical Operations\n",
    "import math\n",
    "import numpy as np\n",
    "\n",
    "# Reading/Writing Data\n",
    "import pandas as pd\n",
    "import os\n",
    "import csv\n",
    "\n",
    "# For Progress Bar\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Pytorch\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.utils.data import Dataset, DataLoader, random_split\n",
    "\n",
    "# For plotting learning curve\n",
    "from torch.utils.tensorboard import SummaryWriter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Some Utility Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def same_seed(seed):\n",
    "    '''Fixes random number generator seeds for reproducibility.'''\n",
    "    torch.backends.cudnn.deterministic = True\n",
    "    torch.backends.cudnn.benchmark = False\n",
    "    np.random.seed(seed)\n",
    "    torch.manual_seed(seed)\n",
    "    if torch.cuda.is_available():\n",
    "        torch.cuda.manual_seed_all(seed)\n",
    "\n",
    "def train_valid_split(data_set, valid_ratio, seed):\n",
    "    '''Split provided training data into training set and validation set'''\n",
    "    valid_set_size = int(valid_ratio * len(data_set))\n",
    "    train_set_size = len(data_set) - valid_set_size\n",
    "    train_set, valid_set = random_split(data_set, [train_set_size, valid_set_size], generator=torch.Generator().manual_seed(seed))\n",
    "    return np.array(train_set), np.array(valid_set)\n",
    "\n",
    "def predict(test_loader, model, device):\n",
    "    model.eval() # Set your model to evaluation mode.\n",
    "    preds = []\n",
    "    for x in tqdm(test_loader):\n",
    "        x = x.to(device)\n",
    "        with torch.no_grad():\n",
    "            pred = model(x)\n",
    "            preds.append(pred.detach().cpu())\n",
    "    preds = torch.cat(preds, dim=0).numpy()\n",
    "    return preds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class COVID19Dataset(Dataset):\n",
    "    '''\n",
    "    x: Features.\n",
    "    y: Targets, if none, do prediction.\n",
    "    '''\n",
    "    def __init__(self, x, y=None):\n",
    "        if y is None:\n",
    "            self.y = y\n",
    "        else:\n",
    "            self.y = torch.FloatTensor(y)\n",
    "        self.x = torch.FloatTensor(x)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        if self.y is None:\n",
    "            return self.x[idx]\n",
    "        else:\n",
    "            return self.x[idx], self.y[idx]\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Neural Network Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class My_Model(nn.Module):\n",
    "    def __init__(self, input_dim):\n",
    "        super(My_Model, self).__init__()\n",
    "        # TODO: modify model's structure, be aware of dimensions.\n",
    "        self.fc1 = nn.Linear(input_dim, 64)\n",
    "        self.fc2 = nn.Linear(64, 16)\n",
    "        self.fc3 = nn.Linear(16, 1)\n",
    "        self.leaky_relu = nn.LeakyReLU()\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.leaky_relu(self.fc1(x))\n",
    "        x = self.leaky_relu(self.fc2(x))\n",
    "        x = self.fc3(x)\n",
    "        x = x.squeeze(1)\n",
    "        return x"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import f_regression\n",
    "\n",
    "# 选择 k 个相关性最强的特征\n",
    "def select(k, x, y):\n",
    "    selector = SelectKBest(score_func=f_regression, k=k)\n",
    "    selector.fit_transform(x, y)\n",
    "\n",
    "    selected_indices = selector.get_support(indices=True)\n",
    "    selected_feature_list = selected_indices.tolist()\n",
    "    print(selected_feature_list)\n",
    "    return selected_feature_list\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def select_feat(train_data, valid_data, test_data, select_all=True):\n",
    "    '''Selects useful features to perform regression'''\n",
    "    y_train, y_valid = train_data[:,-1], valid_data[:,-1]\n",
    "    raw_x_train, raw_x_valid, raw_x_test = train_data[:,:-1], valid_data[:,:-1], test_data\n",
    "\n",
    "    if select_all:\n",
    "        feat_idx = list(range(raw_x_train.shape[1]))\n",
    "    else:\n",
    "        feat_idx = select(17, raw_x_train, y_train) # TODO: Select suitable feature columns.\n",
    "\n",
    "    return raw_x_train[:,feat_idx], raw_x_valid[:,feat_idx], raw_x_test[:,feat_idx], y_train, y_valid"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training Loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def trainer(train_loader, valid_loader, model, config, device):\n",
    "\n",
    "    criterion = nn.MSELoss(reduction='mean') # Define your loss function, do not modify this.\n",
    "\n",
    "    # Define your optimization algorithm.\n",
    "    # TODO: Please check https://pytorch.org/docs/stable/optim.html to get more available algorithms.\n",
    "    # TODO: L2 regularization (optimizer(weight decay...) or implement by your self).\n",
    "    \n",
    "    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=1e-5)\n",
    "    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer=optimizer, T_0=10, T_mult=2)\n",
    "    #optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9)\n",
    "\n",
    "    writer = SummaryWriter() # Writer of tensoboard.\n",
    "\n",
    "    if not os.path.isdir('./models'):\n",
    "        os.mkdir('./models') # Create directory of saving models.\n",
    "\n",
    "    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0\n",
    "\n",
    "    for epoch in range(n_epochs):\n",
    "        model.train() # Set your model to train mode.\n",
    "        loss_record = []\n",
    "\n",
    "        # tqdm is a package to visualize your training progress.\n",
    "        # train_pbar = tqdm(train_loader, position=0, leave=True)\n",
    "\n",
    "        for x, y in train_loader:\n",
    "            optimizer.zero_grad()               # Set gradient to zero.\n",
    "            x, y = x.to(device), y.to(device)   # Move your data to device.\n",
    "            pred = model(x)\n",
    "            loss = criterion(pred, y)\n",
    "            loss.backward()                     # Compute gradient(backpropagation).\n",
    "            optimizer.step()                    # Update parameters.\n",
    "            step += 1\n",
    "            loss_record.append(loss.detach().item())\n",
    "\n",
    "            # Display current epoch number and loss on tqdm progress bar.\n",
    "            # train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')\n",
    "            # train_pbar.set_postfix({'loss': loss.detach().item()})\n",
    "\n",
    "        mean_train_loss = sum(loss_record)/len(loss_record)\n",
    "        writer.add_scalar('Loss/train', mean_train_loss, step)\n",
    "        scheduler.step()\n",
    "\n",
    "        model.eval() # Set your model to evaluation mode.\n",
    "        loss_record = []\n",
    "        for x, y in valid_loader:\n",
    "            x, y = x.to(device), y.to(device)\n",
    "            with torch.no_grad():\n",
    "                pred = model(x)\n",
    "                loss = criterion(pred, y)\n",
    "\n",
    "            loss_record.append(loss.item())\n",
    "\n",
    "        mean_valid_loss = sum(loss_record)/len(loss_record)\n",
    "        if epoch % 50 == 0: \n",
    "            print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')\n",
    "        writer.add_scalar('Loss/valid', mean_valid_loss, step)\n",
    "\n",
    "        if mean_valid_loss < best_loss:\n",
    "            best_loss = mean_valid_loss\n",
    "            torch.save(model.state_dict(), config['save_path']) # Save your best model\n",
    "            print('Saving model with loss {:.3f}...'.format(best_loss))\n",
    "            early_stop_count = 0\n",
    "        else:\n",
    "            early_stop_count += 1\n",
    "\n",
    "        if early_stop_count >= config['early_stop']:\n",
    "            print('\\nModel is not improving, so we halt the training session.')\n",
    "            return"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configurations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
    "config = {\n",
    "    'seed': 5201314,      # Your seed number, you can pick your lucky number. :)\n",
    "    'select_all': False,   # Whether to use all features.\n",
    "    'valid_ratio': 0.2,   # validation_size = train_size * valid_ratio\n",
    "    'n_epochs': 100,     # Number of epochs 10000.\n",
    "    'batch_size': 256,\n",
    "    'learning_rate': 1e-3,\n",
    "    'early_stop': 2000,    # If model has not improved for this many consecutive epochs, stop training.\n",
    "    'save_path': './models/model.ckpt'  # Your model will be saved here.\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DataLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train_data size: (2160, 118)\n",
      "valid_data size: (539, 118)\n",
      "test_data size: (1078, 117)\n",
      "[40, 41, 53, 56, 57, 69, 72, 73, 85, 87, 88, 89, 101, 102, 103, 104, 105]\n",
      "number of features: 17\n"
     ]
    }
   ],
   "source": [
    "# Set seed for reproducibility\n",
    "same_seed(config['seed'])\n",
    "\n",
    "\n",
    "# train_data size: 2699 x 118 (id + 37 states + 16 features x 5 days)\n",
    "# test_data size: 1078 x 117 (without last day's positive rate)\n",
    "train_data, test_data = pd.read_csv('./datasets/covid.train.csv').values, pd.read_csv('./datasets/covid.test.csv').values\n",
    "train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])\n",
    "\n",
    "# Print out the data size.\n",
    "print(f\"\"\"train_data size: {train_data.shape}\n",
    "valid_data size: {valid_data.shape}\n",
    "test_data size: {test_data.shape}\"\"\")\n",
    "\n",
    "# Select features\n",
    "x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])\n",
    "\n",
    "# Print out the number of features.\n",
    "print(f'number of features: {x_train.shape[1]}')\n",
    "\n",
    "train_dataset, valid_dataset, test_dataset = COVID19Dataset(x_train, y_train), \\\n",
    "                                            COVID19Dataset(x_valid, y_valid), \\\n",
    "                                            COVID19Dataset(x_test)\n",
    "\n",
    "# Pytorch data loader loads pytorch dataset into batches.\n",
    "train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)\n",
    "valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)\n",
    "test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start Training!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch [1/10]: Train loss: 114.6726, Valid loss: 74.7515\n",
      "Saving model with loss 74.752...\n",
      "Saving model with loss 31.301...\n",
      "Saving model with loss 11.060...\n",
      "Saving model with loss 9.423...\n",
      "Saving model with loss 6.944...\n",
      "Saving model with loss 5.749...\n",
      "Saving model with loss 5.531...\n"
     ]
    }
   ],
   "source": [
    "model = My_Model(input_dim=x_train.shape[1]).to(device) # put your model and data on the same computation device.\n",
    "trainer(train_loader, valid_loader, model, config, device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "      <iframe id=\"tensorboard-frame-9a2ec85ed5be226f\" width=\"100%\" height=\"800\" frameborder=\"0\">\n",
       "      </iframe>\n",
       "      <script>\n",
       "        (function() {\n",
       "          const frame = document.getElementById(\"tensorboard-frame-9a2ec85ed5be226f\");\n",
       "          const url = new URL(\"/\", window.location);\n",
       "          const port = 6006;\n",
       "          if (port) {\n",
       "            url.port = port;\n",
       "          }\n",
       "          frame.src = url;\n",
       "        })();\n",
       "      </script>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%reload_ext tensorboard\n",
    "%tensorboard --logdir=./runs/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████| 5/5 [00:00<00:00, 413.35it/s]\n"
     ]
    }
   ],
   "source": [
    "def save_pred(preds, file):\n",
    "    ''' Save predictions to specified file '''\n",
    "    with open(file, 'w') as fp:\n",
    "        writer = csv.writer(fp)\n",
    "        writer.writerow(['id', 'tested_positive'])\n",
    "        for i, p in enumerate(preds):\n",
    "            writer.writerow([i, p])\n",
    "\n",
    "model = My_Model(input_dim=x_train.shape[1]).to(device)\n",
    "model.load_state_dict(torch.load(config['save_path']))\n",
    "preds = predict(test_loader, model, device)\n",
    "save_pred(preds, 'pred.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
